Step 1: Data cleaning
- Raw data inspection and distribution checks
- Robust reconstruction of missing values using unsupervised RandomForest + multidimensional scaling + kmeans + rfImpute
- Dataset rebuilt with merged left and right sides by average
# Chunk defaults: echo the code in the rendered document and draw PNG
# figures through the cairo device (better anti-aliasing on Windows).
knitr::opts_chunk$set(
  echo = TRUE,
  dev.args = list(png = list(type = "cairo"))
)
# --- Package loading ----------------------------------------------------
# The '##' lines interleaved below are knit-time console output preserved
# from the original render (startup messages and version warnings).
# uwot: UMAP dimensionality reduction (used for supervised embeddings later).
library(uwot)
## Warning: package 'uwot' was built under R version 3.6.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.6.2
# skimr: compact data summaries via skim().
library(skimr)
# randomForest: unsupervised RF proximities, na.roughfix() and rfImpute().
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
# magrittr: the %>% pipe.
library(magrittr)
# caret: data partitioning, model training, confusion matrices.
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# gbm: gradient boosting backend for caret's method = "gbm".
library(gbm)
## Loaded gbm 2.1.5
# NOTE(review): absolute, user-specific path — not portable; consider a
# relative path or here::here().
data <- read.csv("~/My R environment/CALCTALUS/data_CALCTALUS.csv", encoding="UTF-8", row.names=1)
# Summarise the raw dataset (per the output below: 180 rows, 33 columns,
# left/right side measurements still separate, some missing values).
skim(data)
Data summary
| Name |
data |
| Number of rows |
180 |
| Number of columns |
33 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
1 |
| numeric |
32 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| SEX |
0 |
1 |
FALSE |
2 |
FEM: 93, MAL: 87 |
Variable type: numeric
| AGE |
0 |
1.00 |
3.28 |
1.81 |
1.0 |
2.00 |
3.0 |
5.00 |
7.0 |
▇▃▃▂▃ |
| Sexo |
0 |
1.00 |
1.48 |
0.50 |
1.0 |
1.00 |
1.0 |
2.00 |
2.0 |
▇▁▁▁▇ |
| TM1E |
3 |
0.98 |
50.59 |
3.69 |
39.5 |
48.00 |
50.0 |
53.00 |
61.0 |
▁▆▇▅▁ |
| TM1D |
2 |
0.99 |
50.87 |
3.67 |
39.0 |
48.00 |
50.5 |
53.00 |
60.5 |
▁▃▇▆▂ |
| TM2E |
3 |
0.98 |
39.23 |
3.01 |
31.0 |
37.00 |
39.0 |
41.50 |
46.5 |
▁▆▇▇▂ |
| TM2D |
2 |
0.99 |
39.38 |
2.89 |
32.0 |
37.00 |
39.0 |
41.38 |
46.0 |
▁▆▇▇▂ |
| TM3E |
3 |
0.98 |
29.10 |
2.30 |
21.5 |
27.00 |
29.0 |
30.50 |
35.0 |
▁▂▇▅▁ |
| TM3D |
2 |
0.99 |
29.18 |
2.27 |
21.0 |
27.12 |
29.0 |
31.00 |
36.0 |
▁▅▇▅▁ |
| TM3aE |
3 |
0.98 |
30.54 |
2.36 |
23.0 |
29.00 |
30.0 |
32.00 |
37.5 |
▁▃▇▅▁ |
| TM3aD |
2 |
0.99 |
30.55 |
2.34 |
22.0 |
29.00 |
30.5 |
32.00 |
38.0 |
▁▃▇▅▁ |
| TM4E |
3 |
0.98 |
31.15 |
2.43 |
24.0 |
30.00 |
31.0 |
33.00 |
39.0 |
▁▆▇▂▁ |
| TM4D |
2 |
0.99 |
31.38 |
2.49 |
24.0 |
30.00 |
31.0 |
33.00 |
38.5 |
▁▅▇▅▁ |
| TM5E |
3 |
0.98 |
26.45 |
2.36 |
21.5 |
25.00 |
26.0 |
28.00 |
33.0 |
▂▇▆▃▁ |
| TM5D |
2 |
0.99 |
26.20 |
2.34 |
21.0 |
24.42 |
26.0 |
28.00 |
32.5 |
▂▇▆▆▁ |
| CM1E |
5 |
0.97 |
75.33 |
5.39 |
57.0 |
72.00 |
75.0 |
79.00 |
91.0 |
▁▃▇▅▁ |
| CM1D |
5 |
0.97 |
75.63 |
5.61 |
57.0 |
72.00 |
76.0 |
79.00 |
92.5 |
▁▃▇▃▁ |
| CM1aE |
5 |
0.97 |
70.73 |
5.26 |
53.0 |
67.25 |
70.0 |
73.75 |
86.0 |
▁▃▇▅▁ |
| CM1aD |
5 |
0.97 |
70.41 |
5.26 |
53.0 |
67.00 |
70.0 |
74.00 |
86.0 |
▁▃▇▅▁ |
| CM2E |
5 |
0.97 |
39.77 |
3.16 |
29.0 |
37.50 |
39.5 |
42.00 |
48.0 |
▁▂▇▇▂ |
| CM2D |
4 |
0.98 |
39.76 |
3.04 |
29.5 |
38.00 |
39.5 |
42.00 |
48.0 |
▁▂▇▆▁ |
| CM4E |
7 |
0.96 |
40.83 |
3.87 |
33.0 |
38.00 |
40.0 |
44.00 |
54.0 |
▅▇▆▂▁ |
| CM4D |
5 |
0.97 |
40.65 |
3.87 |
32.0 |
38.00 |
40.0 |
43.00 |
53.0 |
▃▇▆▃▁ |
| CMSE |
6 |
0.97 |
46.95 |
4.18 |
37.0 |
44.00 |
46.5 |
50.00 |
58.0 |
▁▇▇▅▂ |
| CMSD |
4 |
0.98 |
46.61 |
3.95 |
37.0 |
44.00 |
46.0 |
49.62 |
56.0 |
▁▇▇▅▂ |
| CM5E |
5 |
0.97 |
53.86 |
4.18 |
38.0 |
51.50 |
54.0 |
56.00 |
65.0 |
▁▂▇▅▁ |
| CM5D |
5 |
0.97 |
53.95 |
4.17 |
38.0 |
51.25 |
54.0 |
56.50 |
64.0 |
▁▂▇▇▂ |
| CM7E |
14 |
0.92 |
43.09 |
4.28 |
33.0 |
40.00 |
42.5 |
47.00 |
58.0 |
▂▇▆▂▁ |
| CM7D |
15 |
0.92 |
43.83 |
4.33 |
33.0 |
41.00 |
44.0 |
47.00 |
55.5 |
▂▇▇▆▁ |
| CM8E |
9 |
0.95 |
29.48 |
2.63 |
21.0 |
28.00 |
29.0 |
31.25 |
36.0 |
▁▃▇▅▂ |
| CM8D |
12 |
0.93 |
29.48 |
2.67 |
21.0 |
28.00 |
29.0 |
31.12 |
36.0 |
▁▃▇▅▁ |
| CFCPCE |
5 |
0.97 |
44.58 |
3.47 |
34.0 |
42.00 |
44.5 |
47.00 |
54.0 |
▁▅▇▅▁ |
| CFAPCD |
4 |
0.98 |
45.05 |
3.62 |
34.0 |
43.00 |
45.0 |
47.00 |
56.0 |
▁▃▇▃▁ |
# Missing-value reconstruction without touching the outcome variable:
#   1. fit an unsupervised random forest on roughly-imputed measurements,
#   2. project (1 - proximity) to 2-D with classical MDS,
#   3. cluster the projection with k-means (k = 6),
#   4. feed the cluster labels to rfImpute() as a pseudo-response.
measure_block <- data[, -c(1:3)]
set.seed(29)
prox_forest <- randomForest(na.roughfix(measure_block), proximity = TRUE)
mds_coords <- cmdscale(1 - prox_forest$proximity)
clust <- kmeans(mds_coords, 6)
plot(mds_coords, col = clust$cluster)

# Drop the pseudo-response column added by rfImpute, keep the imputed data.
new_x <- rfImpute(measure_block, factor(clust$cluster))[, -1]
## ntree OOB 1 2 3 4 5 6
## 300: 17.78% 0.00% 11.43% 80.00% 39.13% 10.71% 13.79%
## ntree OOB 1 2 3 4 5 6
## 300: 17.22% 0.00% 11.43% 73.33% 34.78% 17.86% 10.34%
## ntree OOB 1 2 3 4 5 6
## 300: 17.78% 0.00% 14.29%100.00% 26.09% 10.71% 10.34%
## ntree OOB 1 2 3 4 5 6
## 300: 18.89% 0.00% 14.29% 80.00% 34.78% 17.86% 13.79%
## ntree OOB 1 2 3 4 5 6
## 300: 17.22% 0.00% 5.71% 80.00% 30.43% 17.86% 17.24%
# Merge left ("E") and right ("D") side measurements into a single value by
# averaging each adjacent column pair. After rfImpute there are no missing
# values left, and mean/median remain essentially unchanged, so this is a
# robust reconstruction.
pair_idx <- seq(2, ncol(new_x), 2)  # second column of each left/right pair
x_mod <- sapply(pair_idx, function(i) {
  rowMeans(new_x[, c(i, i - 1)], na.rm = TRUE)
})
measurements <- data.frame(x_mod)
# Variable names: strip the trailing side letter (e.g. "TM1D" -> "TM1").
selected.vars <- vapply(pair_idx, function(i) {
  nm <- colnames(new_x)[i]
  substr(nm, 1, nchar(nm) - 1)
}, character(1))
colnames(measurements) <- selected.vars
calctalus <- cbind(data[,1:2], measurements)
# write.csv2(calctalus, "calctalus.csv") # we could save the dataset at this point
data <- calctalus[,-1] # remove AGE, not relevant for this problem
machinedata <- data[,-1] # remove SEX, for data managing without the Y/output
# NOTE(review): machinedata already excludes SEX, so the extra [,-1] below
# also drops the first measurement column (TM1). datascaled is not used
# later in this document; confirm whether this is intentional before reuse.
datascaled <- scale(machinedata[,-1])
- Check final dataset, ready for visualization and modelling
skim(data)
Data summary
| Name |
data |
| Number of rows |
180 |
| Number of columns |
16 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
1 |
| numeric |
15 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| SEX |
0 |
1 |
FALSE |
2 |
FEM: 93, MAL: 87 |
Variable type: numeric
| TM1 |
0 |
1 |
50.71 |
3.66 |
39.25 |
48.00 |
50.38 |
53.00 |
60.75 |
▁▃▇▅▂ |
| TM2 |
0 |
1 |
39.29 |
2.92 |
31.50 |
37.25 |
39.00 |
41.50 |
46.25 |
▁▅▇▇▂ |
| TM3 |
0 |
1 |
29.13 |
2.27 |
21.25 |
27.19 |
29.00 |
30.56 |
35.50 |
▁▂▇▅▁ |
| TM3a |
0 |
1 |
30.53 |
2.34 |
22.50 |
29.00 |
30.25 |
32.25 |
37.75 |
▁▃▇▅▁ |
| TM4 |
0 |
1 |
31.25 |
2.40 |
24.00 |
29.75 |
31.00 |
33.00 |
38.75 |
▁▅▇▅▁ |
| TM5 |
0 |
1 |
26.32 |
2.31 |
21.25 |
24.50 |
26.00 |
28.00 |
32.50 |
▂▇▇▃▁ |
| CM1 |
0 |
1 |
75.50 |
5.43 |
57.00 |
72.00 |
75.29 |
78.52 |
90.50 |
▁▂▇▅▁ |
| CM1a |
0 |
1 |
70.59 |
5.19 |
53.00 |
67.19 |
70.26 |
73.56 |
86.00 |
▁▂▇▅▁ |
| CM2 |
0 |
1 |
39.76 |
3.05 |
29.25 |
37.50 |
39.50 |
42.00 |
48.00 |
▁▂▇▆▁ |
| CM4 |
0 |
1 |
40.76 |
3.81 |
32.50 |
38.00 |
40.20 |
43.56 |
53.50 |
▃▇▆▂▁ |
| CMS |
0 |
1 |
46.79 |
3.99 |
37.00 |
43.94 |
46.28 |
50.00 |
56.75 |
▁▇▇▅▂ |
| CM5 |
0 |
1 |
53.91 |
4.09 |
38.00 |
51.50 |
53.57 |
56.31 |
64.00 |
▁▂▇▇▂ |
| CM7 |
0 |
1 |
43.62 |
4.19 |
33.00 |
40.50 |
43.13 |
47.00 |
56.75 |
▂▇▇▆▁ |
| CM8 |
0 |
1 |
29.50 |
2.57 |
21.00 |
27.75 |
29.25 |
31.50 |
35.75 |
▁▂▇▅▂ |
| CFAPC |
0 |
1 |
44.82 |
3.46 |
34.00 |
42.44 |
44.75 |
46.75 |
54.00 |
▁▃▇▅▁ |
Step 2: Visualization of dataset properties
# --- Step 2: Gaussian Graphical Model of partial correlations -----------
library(correlation)
library(ggraph)
# Partial-correlation network of the 15 measurements, using the plot()
# method from the correlation package.
gg <- machinedata %>%
correlation(partial = TRUE) %>%
plot()
# Recolour edges with the plasma viridis palette; this deliberately
# replaces the edge-colour scale plot() already added (hence the ggplot
# message in the rendered output).
gg + ggtitle("Gaussian Graphical Model (GGM) of CalcTalus dataset") +
scale_edge_color_viridis(option = "plasma", name = "r (partial correlation)")
## Scale for 'edge_colour' is already present. Adding another scale for
## 'edge_colour', which will replace the existing scale.

library(ggfortify)
# PCA biplot of the 15 merged measurements (variables standardised via
# scale. = TRUE), points coloured by sex, loading vectors drawn in black.
autoplot(prcomp(machinedata, scale. = TRUE), data = calctalus, colour = 'SEX',
         loadings = TRUE, loadings.label = TRUE, loadings.colour = 'black') +
  theme_minimal()
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

# Supervised 3-D UMAP of the full dataset (y = SEX shapes the layout).
# NOTE(review): no set.seed() before this call, so the embedding is not
# reproducible run-to-run — confirm whether that matters here.
ct.umap <- as.data.frame(umap(machinedata, n_components = 3, metric = "cosine",
init = "pca", spread = 24, n_epochs = 1024, min_dist = 0.5, y = data$SEX))
ct.umap$SEX <- data$SEX
library(plotly)
# Interactive 3-D scatter of the embedding, coloured by sex.
plot_ly(data = ct.umap, x = ~V1, y = ~V2, z = ~V3, color = ~SEX, type = 'scatter3d', mode = "markers")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
Step 3: Statistical modelling
# --- Step 3: sex classification on the merged measurements --------------
library(caret)
library(gbm)
# Stratified 75/25 train/test split on the outcome SEX (136/44 rows per
# the model summaries below).
set.seed(29)
inTraining <- createDataPartition(data$SEX, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
# Leave-one-out cross-validation drives hyperparameter selection.
fitControl <- trainControl(method = "LOOCV")
set.seed(29)
# Gradient boosting over caret's default tuning grid (n.trees x depth).
gbmFit1 <- train(SEX ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
gbmFit1
## Stochastic Gradient Boosting
##
## 136 samples
## 15 predictor
## 2 classes: 'FEMALE', 'MALE'
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## n.trees interaction.depth Accuracy Kappa
## 50 1 0.8529412 0.7053726
## 50 2 0.8676471 0.7348354
## 50 3 0.8750000 0.7494582
## 100 1 0.8602941 0.7202252
## 100 2 0.8676471 0.7346054
## 100 3 0.8750000 0.7494582
## 150 1 0.8750000 0.7496752
## 150 2 0.8676471 0.7346054
## 150 3 0.8823529 0.7640937
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 10.
varImp(gbmFit1)
## gbm variable importance
##
## Overall
## TM4 100.000
## TM1 99.823
## CMS 69.306
## CM1a 60.232
## TM3 38.943
## CM2 37.448
## CM5 24.136
## CFAPC 11.135
## TM5 8.051
## CM8 7.137
## CM7 6.993
## CM4 4.732
## CM1 2.301
## TM2 1.602
## TM3a 0.000
confusionMatrix(predict(gbmFit1, testing), testing$SEX)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FEMALE MALE
## FEMALE 20 2
## MALE 3 19
##
## Accuracy : 0.8864
## 95% CI : (0.7544, 0.9621)
## No Information Rate : 0.5227
## P-Value [Acc > NIR] : 3.192e-07
##
## Kappa : 0.7727
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8696
## Specificity : 0.9048
## Pos Pred Value : 0.9091
## Neg Pred Value : 0.8636
## Prevalence : 0.5227
## Detection Rate : 0.4545
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8872
##
## 'Positive' Class : FEMALE
##
# Random forest on the same split and LOOCV scheme (tunes mtry).
# NOTE(review): no set.seed() immediately before this train() call, so
# exact reproduction depends on the RNG state left by the previous chunk.
rfFit <- train(SEX ~ ., data = training,
method = "rf",
trControl = fitControl,
verbose = FALSE)
rfFit
## Random Forest
##
## 136 samples
## 15 predictor
## 2 classes: 'FEMALE', 'MALE'
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8602941 0.7197397
## 8 0.8455882 0.6905072
## 15 0.8235294 0.6461405
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
varImp(rfFit)
## rf variable importance
##
## Overall
## TM1 100.00
## CMS 87.94
## TM4 74.97
## TM3a 61.14
## CM8 58.13
## CM1a 56.16
## TM3 43.85
## CM4 41.83
## TM5 40.35
## CFAPC 40.19
## CM1 32.44
## TM2 31.95
## CM2 28.33
## CM7 13.60
## CM5 0.00
confusionMatrix(predict(rfFit, testing), testing$SEX)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FEMALE MALE
## FEMALE 21 1
## MALE 2 20
##
## Accuracy : 0.9318
## 95% CI : (0.8134, 0.9857)
## No Information Rate : 0.5227
## P-Value [Acc > NIR] : 4.385e-09
##
## Kappa : 0.8636
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9130
## Specificity : 0.9524
## Pos Pred Value : 0.9545
## Neg Pred Value : 0.9091
## Prevalence : 0.5227
## Detection Rate : 0.4773
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9327
##
## 'Positive' Class : FEMALE
##
# RBF-kernel SVM on the same split and LOOCV scheme (tunes C; sigma held
# constant per the summary below).
svmFit <- train(SEX ~ ., data = training,
method = "svmRadial",
trControl = fitControl)
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 136 samples
## 15 predictor
## 2 classes: 'FEMALE', 'MALE'
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8455882 0.6899696
## 0.50 0.8676471 0.7343750
## 1.00 0.8529412 0.7051171
##
## Tuning parameter 'sigma' was held constant at a value of 0.1391779
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1391779 and C = 0.5.
confusionMatrix(predict(svmFit, testing), testing$SEX)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FEMALE MALE
## FEMALE 22 3
## MALE 1 18
##
## Accuracy : 0.9091
## 95% CI : (0.7833, 0.9747)
## No Information Rate : 0.5227
## P-Value [Acc > NIR] : 4.23e-08
##
## Kappa : 0.817
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9565
## Specificity : 0.8571
## Pos Pred Value : 0.8800
## Neg Pred Value : 0.9474
## Prevalence : 0.5227
## Detection Rate : 0.5000
## Detection Prevalence : 0.5682
## Balanced Accuracy : 0.9068
##
## 'Positive' Class : FEMALE
##
Supervised UMAP modeling
# --- Supervised UMAP: embed the training data using SEX as target -------
library(vizier)
set.seed(29)
# NOTE(review): `training` still contains the SEX factor column; per the
# log below umap() used only the 15 numeric columns, and SEX enters via y=.
# ret_model = TRUE keeps the fitted model so the test set can be projected.
train_umap <- umap(training, y = training$SEX, verbose = TRUE, n_components = 3, spread = 8, ret_model = TRUE)
## 13:50:53 UMAP embedding parameters a = 0.07163 b = 0.7918
## 13:50:53 Read 136 rows and found 15 numeric columns
## 13:50:53 Using Annoy for neighbor search, n_neighbors = 15
## 13:50:53 Building Annoy index with metric = euclidean, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 13:50:53 Writing NN index file to temp file C:\Users\delvi\AppData\Local\Temp\RtmpaACilG\file422c594f7e4c
## 13:50:53 Searching Annoy index using 4 threads, search_k = 1500
## 13:50:54 Annoy recall = 100%
## 13:50:54 Commencing smooth kNN distance calibration using 4 threads
## 13:50:54 Processing y data
## 13:50:54 Carrying out categorical intersection for 1 column
## 13:50:54 Applying categorical set intersection, weight = 0.5 far distance = 5
## 13:50:54 Initializing from normalized Laplacian + noise
## 13:50:54 Commencing optimization for 500 epochs, with 2394 positive edges
## 13:50:54 Optimization finished
vizier::embed_plotly(train_umap$embedding, as.factor(training$SEX))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Project the held-out test rows into the fitted embedding. Test labels
# are NOT used here, so the test coordinates remain leakage-free.
set.seed(29)
test_umap <- umap_transform(testing, train_umap, verbose = TRUE)
## 13:50:55 Read 44 rows and found 15 numeric columns
## 13:50:55 Processing block 1 of 1
## 13:50:55 Writing NN index file to temp file C:\Users\delvi\AppData\Local\Temp\RtmpaACilG\file422c77ae63fd
## 13:50:55 Searching Annoy index using 4 threads, search_k = 1500
## 13:50:55 Commencing smooth kNN distance calibration using 4 threads
## 13:50:55 Initializing by weighted average of neighbor coordinates using 4 threads
## 13:50:55 Commencing optimization for 167 epochs, with 660 positive edges
## 13:50:55 Finished
# Interactive plot of the projected test embedding, coloured by sex.
vizier::embed_plotly(test_umap, as.factor(testing$SEX))
# Train classifiers on the 3-D supervised embedding coordinates.
# NOTE(review): the embedding was fit with the training labels (y = SEX),
# so the perfect LOOCV accuracies reported below are optimistically biased;
# the hold-out confusion matrices are the fairer estimate.
training2 <- as.data.frame(train_umap$embedding)
training2$SEX <- training$SEX
set.seed(29)
gbmFit2 <- train(SEX ~ ., data = training2,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
gbmFit2
## Stochastic Gradient Boosting
##
## 136 samples
## 3 predictor
## 2 classes: 'FEMALE', 'MALE'
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## n.trees interaction.depth Accuracy Kappa
## 50 1 1 1
## 50 2 1 1
## 50 3 1 1
## 100 1 1 1
## 100 2 1 1
## 100 3 1 1
## 150 1 1 1
## 150 2 1 1
## 150 3 1 1
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 10.
varImp(gbmFit2)
## gbm variable importance
##
## Overall
## V1 100
## V3 0
## V2 0
confusionMatrix(predict(gbmFit2, test_umap), testing$SEX)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FEMALE MALE
## FEMALE 23 2
## MALE 0 19
##
## Accuracy : 0.9545
## 95% CI : (0.8453, 0.9944)
## No Information Rate : 0.5227
## P-Value [Acc > NIR] : 3.335e-10
##
## Kappa : 0.9085
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.9048
## Pos Pred Value : 0.9200
## Neg Pred Value : 1.0000
## Prevalence : 0.5227
## Detection Rate : 0.5227
## Detection Prevalence : 0.5682
## Balanced Accuracy : 0.9524
##
## 'Positive' Class : FEMALE
##
# Random forest on the embedding coordinates (only 3 predictors, hence the
# truncated mtry grid noted below).
rfFit2 <- train(SEX ~ ., data = training2,
method = "rf",
trControl = fitControl,
verbose = FALSE)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
rfFit2
## Random Forest
##
## 136 samples
## 3 predictor
## 2 classes: 'FEMALE', 'MALE'
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 1 1
## 3 1 1
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
varImp(rfFit2)
## rf variable importance
##
## Overall
## V1 100.0
## V3 14.4
## V2 0.0
confusionMatrix(predict(rfFit2, test_umap), testing$SEX)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FEMALE MALE
## FEMALE 23 2
## MALE 0 19
##
## Accuracy : 0.9545
## 95% CI : (0.8453, 0.9944)
## No Information Rate : 0.5227
## P-Value [Acc > NIR] : 3.335e-10
##
## Kappa : 0.9085
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.9048
## Pos Pred Value : 0.9200
## Neg Pred Value : 1.0000
## Prevalence : 0.5227
## Detection Rate : 0.5227
## Detection Prevalence : 0.5682
## Balanced Accuracy : 0.9524
##
## 'Positive' Class : FEMALE
##
# RBF-kernel SVM on the embedding coordinates.
svmFit2 <- train(SEX ~ ., data = training2,
method = "svmRadial",
trControl = fitControl)
svmFit2
## Support Vector Machines with Radial Basis Function Kernel
##
## 136 samples
## 3 predictor
## 2 classes: 'FEMALE', 'MALE'
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 1 1
## 0.50 1 1
## 1.00 1 1
##
## Tuning parameter 'sigma' was held constant at a value of 0.9823658
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.9823658 and C = 0.25.
varImp(svmFit2)
## ROC curve variable importance
##
## Importance
## V1 100.000
## V3 5.291
## V2 0.000
confusionMatrix(predict(svmFit2, test_umap), testing$SEX)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FEMALE MALE
## FEMALE 23 3
## MALE 0 18
##
## Accuracy : 0.9318
## 95% CI : (0.8134, 0.9857)
## No Information Rate : 0.5227
## P-Value [Acc > NIR] : 4.385e-09
##
## Kappa : 0.8625
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 1.0000
## Specificity : 0.8571
## Pos Pred Value : 0.8846
## Neg Pred Value : 1.0000
## Prevalence : 0.5227
## Detection Rate : 0.5227
## Detection Prevalence : 0.5909
## Balanced Accuracy : 0.9286
##
## 'Positive' Class : FEMALE
##